8  Logistic regression: Others

# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

sns.set()

8.1 Decision Boundary in Classification

# Importing the dataset
dataset = pd.read_csv('datasets/apples_and_oranges.csv')
dataset.head()
Weight Size Class
0 69 4.39 orange
1 69 4.21 orange
2 65 4.09 orange
3 72 5.85 apple
4 67 4.70 orange
# No. of apples and oranges
dataset['Class'].value_counts()
Class
orange    20
apple     20
Name: count, dtype: int64

8.1.1 Encoding Target

le = LabelEncoder()
dataset['Class'] = le.fit_transform(dataset['Class'])
le.classes_
array(['apple', 'orange'], dtype=object)

This implies that:
  • 0 represents Apple
  • 1 represents Orange
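As a quick check (a small sketch, not part of the original output), the fitted encoder can map the integer codes back to the class names:

# map the integer codes back to the original class names
le.inverse_transform([0, 1])
# expected: array(['apple', 'orange'], dtype=object)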

dataset.head()
Weight Size Class
0 69 4.39 1
1 69 4.21 1
2 65 4.09 1
3 72 5.85 0
4 67 4.70 1

8.1.2 Plotting the dataset

plt.figure(figsize=(9,6))
plt.title('Apples and Oranges', fontweight='bold', fontsize=16)
plt.xlabel('Weight')
plt.ylabel('Size')
scatter = plt.scatter(dataset['Weight'], dataset['Size'], c=dataset['Class'], cmap='viridis')
plt.legend(*scatter.legend_elements(),
           loc = 'upper left',
           title = 'Class');

We can observe that oranges have lower weight and size than apples. Furthermore, by drawing a straight line between these two groups of points, we can clearly separate apples from oranges.

8.1.3 Building a Logistic Regression model to distinguish apples and oranges

As we can clearly distinguish between apples and oranges with a straight-line decision boundary, we can choose the hypothesis \(h(x) = \mathrm{sigmoid}(a_0 + a_1 x_1 + a_2 x_2)\) for logistic regression,
where
\(a_0, a_1, a_2\) are the fitting parameters,
\(x_1\) is Weight, and
\(x_2\) is Size.

# Defining target and features
y = dataset['Class']
x = dataset.drop(columns=['Class'])
# Creating object of LogisticRegression class
log_reg = LogisticRegression()
# Fitting parameters
log_reg.fit(x,y)
LogisticRegression()
# Intercept - a0
log_reg.intercept_
array([106.60287324])
# Coefficients - a1, a2 respectively
log_reg.coef_
array([[-1.42833694, -1.31285258]])
# Predicting labels for the given dataset
label_predictions = log_reg.predict(x)
label_predictions
array([1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0])
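As a sanity check (a small sketch, not part of the original notebook), the predicted labels can be reproduced by hand from the fitted parameters: compute \(z = a_0 + a_1 x_1 + a_2 x_2\), pass it through the sigmoid, and threshold at 0.5.

# reproduce the predictions manually from the fitted intercept and coefficients
z = log_reg.intercept_[0] + x.values @ log_reg.coef_[0]
probs = 1 / (1 + np.exp(-z))                      # P(Class = 1), i.e. probability of orange
manual_labels = (probs > 0.5).astype(int)
np.array_equal(manual_labels, label_predictions)  # expected: True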

8.1.4 Linear Decision Boundary with naive features

# Parameter values
a0 = log_reg.intercept_[0]
a1 = log_reg.coef_[0][0]
a2 = log_reg.coef_[0][1]
# Defining x1 and x2 values for the decision boundary
# On the boundary, a0 + a1*x1 + a2*x2 = 0, so x2 = -(a0/a2) - (a1/a2)*x1
x1 = np.array([69, 71])
x2 = - (a0 / a2) - (a1 / a2) * x1
# Plotting the decision boundary
plt.figure(figsize=(9,6))
plt.title('Apples and Oranges', fontweight='bold', fontsize=16)
plt.xlabel('Weight')
plt.ylabel('Size')
scatter = plt.scatter(dataset['Weight'], dataset['Size'], c=dataset['Class'], cmap='viridis')
plt.legend(*scatter.legend_elements(),
           loc = 'upper left',
           title = 'Class')
plt.plot(x1, x2, color='red', label='Decision Boundary')
plt.show()

In this problem we have just two features, \(x_1\) and \(x_2\). If we use them as they are, we end up with a straight line that divides the 2D plane into two half-planes.
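We can also confirm numerically that the red line really is the decision boundary: every point on it satisfies \(a_0 + a_1 x_1 + a_2 x_2 = 0\), so the model assigns it a probability of exactly 0.5 (a small check added here, reusing the two endpoints computed above).

# the two endpoints of the plotted line should sit on the 0.5 probability contour
boundary_points = pd.DataFrame({'Weight': x1, 'Size': x2})
log_reg.predict_proba(boundary_points)
# expected: both rows are approximately [0.5, 0.5]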

8.2 Non-linear Decision Boundary

On some occasions we want a more complex boundary, and we can achieve this by transforming our features. For instance, when the training data are distributed so that the two classes cannot be separated by a straight line, it becomes necessary to generate polynomial features such as \(x_1^2\) and \(x_2^2\) to better separate the two classes.
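As a small illustration of what such a feature map looks like (a sketch, not from the original notebook), a degree-2 expansion of a single point \((x_1, x_2)\) produces the terms \(1, x_1, x_2, x_1^2, x_1 x_2, x_2^2\):

# degree-2 polynomial feature map for a single demo point (x1, x2) = (0.5, -0.3)
x1_demo, x2_demo = 0.5, -0.3
np.array([1, x1_demo, x2_demo, x1_demo**2, x1_demo * x2_demo, x2_demo**2])
# columns: [1, x1, x2, x1^2, x1*x2, x2^2]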

Let’s delve into a concrete example below. To keep the focus on the decision boundary, I chose not to split the data into training and test sets.

#Load our Dataset for Logistic Regression
components = pd.read_csv('datasets/ex2data2.txt', header=None, names = ['feature 1', 'feature 2', 'faulty'])
components.head()
feature 1 feature 2 faulty
0 0.051267 0.69956 1
1 -0.092742 0.68494 1
2 -0.213710 0.69225 1
3 -0.375000 0.50219 1
4 -0.513250 0.46564 1
# check the balance of the dataset
components['faulty'].value_counts()
faulty
0    60
1    58
Name: count, dtype: int64
# get positive and negative samples for plotting
pos = components['faulty'] == 1
neg = components['faulty'] == 0
# Visualize Data
fig, axes = plt.subplots()
axes.set_xlabel('Feature 1')
axes.set_ylabel('Feature 2')
axes.scatter(components.loc[pos, 'feature 1'], components.loc[pos, 'feature 2'], color='r', marker='x', label='Faulty')
axes.scatter(components.loc[neg, 'feature 1'], components.loc[neg, 'feature 2'], color='g', marker='o', label='Non Faulty')
axes.legend(title='Legend', loc='best')
axes.set_xlim(-1, 1.5)
axes.set_ylim(-1, 1.5)

As we can see, the positive and negative examples are not linearly separable, so we have to add higher-order polynomial features.

# define a function that maps two features to all polynomial terms up to the given degree
def mapFeature(X1, X2, degree):
    # start with the bias (intercept) column of ones
    res = np.ones(X1.shape[0])
    # append every term x1^(i-j) * x2^j for 1 <= i <= degree and 0 <= j <= i
    for i in range(1, degree + 1):
        for j in range(0, i + 1):
            res = np.column_stack((res, (X1 ** (i - j)) * (X2 ** j)))
    return res
# Get the features 
X = components.iloc[:, :2]
degree = 2
X_poly = mapFeature(X.iloc[:, 0], X.iloc[:, 1], degree)
# Get the target variable
y = components.iloc[:, 2]
# logistic (sigmoid) function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
# binary cross-entropy cost averaged over the m training examples
def costFunc(theta, X, y):
    m = y.shape[0]
    z = X.dot(theta)
    h = sigmoid(z)
    term1 = y * np.log(h)
    term2 = (1 - y) * np.log(1 - h)
    J = -np.sum(term1 + term2, axis=0) / m
    return J
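The optimizer used below approximates the gradient numerically. If desired, the analytic gradient of this cost, \(\frac{\partial J}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^m (h_\theta(z_i) - y_i)\, x_{ij}\), can be supplied as well (a sketch under the same definitions; gradFunc is a name introduced here):

def gradFunc(theta, X, y):
    # analytic gradient of the cross-entropy cost: (1/m) * X^T (h - y)
    m = y.shape[0]
    h = sigmoid(X.dot(theta))
    return X.T.dot(h - y) / m
# it can be passed to the optimizer via the jac argument, e.g.
# res = minimize(costFunc, initial_theta.flatten(), args=(X_poly, y), jac=gradFunc)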
# Set initial values for our parameters
initial_theta = np.zeros(X_poly.shape[1]).reshape(X_poly.shape[1], 1)
# Now call the optimization routine
# NOTE: scipy.optimize.minimize picks its step sizes automatically, so there is no learning rate to tune
from scipy.optimize import minimize
res = minimize(costFunc, initial_theta.flatten(), args=(X_poly, y))
# our optimized coefficients
theta = res.x
# define a function to plot the decision boundary
def plotDecisionBoundary(theta,degree, axes):
    u = np.linspace(-1, 1.5, 50)
    v = np.linspace(-1, 1.5, 50)
    U,V = np.meshgrid(u,v)
    # convert U, V to vectors for calculating additional features
    # using vectorized implementation
    U = np.ravel(U)
    V = np.ravel(V)
    Z = np.zeros((len(u) * len(v)))
    
    X_poly = mapFeature(U, V, degree)
    Z = X_poly.dot(theta)
    
    # reshape U, V, Z back to matrix
    U = U.reshape((len(u), len(v)))
    V = V.reshape((len(u), len(v)))
    Z = Z.reshape((len(u), len(v)))
    
    cs = axes.contour(U, V, Z, levels=[0], cmap="Greys_r")
    # legend labels are applied to the labelled artists in the order they were added (Faulty scatter first, then Non Faulty)
    axes.legend(labels=['Faulty', 'Non Faulty', 'Decision Boundary'])
    return cs
# Plot Decision boundary
fig, axes = plt.subplots();
axes.set_xlabel('Feature 1')
axes.set_ylabel('Feature 2')
axes.scatter(components.loc[pos, 'feature 1'], components.loc[pos, 'feature 2'], color = 'r', marker='x', label='Faulty')
axes.scatter(components.loc[neg, 'feature 1'], components.loc[neg, 'feature 2'], color = 'g', marker='o', label='Good')
#axes.legend(title='Legend', loc = 'best' )

plotDecisionBoundary(theta, degree, axes);

Of course, you can increase the degree of the polynomial you fit, but overfitting can become a problem.

# set degree = 6
degree = 6
# map features to the degree
X_poly = mapFeature(X.iloc[:, 0], X.iloc[:, 1], degree)
# set initial parameters
initial_theta = np.zeros(X_poly.shape[1]).reshape(X_poly.shape[1], 1)
# Run the optimization function
res = minimize(costFunc, initial_theta.flatten(), args=(X_poly, y))
theta = res.x.reshape(res.x.shape[0], 1)

# Plot Decision boundary
fig, axes = plt.subplots()
axes.set_xlabel('Feature 1')
axes.set_ylabel('Feature 2')
axes.scatter(components.loc[pos, 'feature 1'], components.loc[pos, 'feature 2'], color='r', marker='x', label='Faulty')
axes.scatter(components.loc[neg, 'feature 1'], components.loc[neg, 'feature 2'], color='g', marker='o', label='Good')
#axes.legend(title='Legend', loc='best')

plotDecisionBoundary(theta, degree, axes)

As we can see, the model tries very hard to classify every single training example correctly and overfits the data. Such a model has no preconceived notion about the separation of the positive and negative examples and can fit almost any kind of data; it will fail to predict the correct class when it sees new examples.

One technique to counter this is regularization, which we will cover in more detail later. The idea is to penalize the model for overfitting by adding a regularization term to the cost function.

The New Cost function with the regularization is specified as

\(J(\theta) = \frac{1}{m} \sum_{i=1}^{m}\left[-y_i \log(h_\theta(z_i)) - (1 - y_i) \log(1 - h_\theta(z_i))\right] + \frac{\lambda}{2m} \sum_{j=1}^{n} \theta_j^2\)

where \(\lambda\) is the regularization factor and
\(n\) is the number of features.
(NOTE: the regularization term does not include the intercept term \(\theta_0\); the sum starts at \(j = 1\).)
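A minimal sketch of this regularized cost, written to match the costFunc defined earlier (costFuncReg and lam are names introduced here, and the intercept column of ones is excluded from the penalty):

def costFuncReg(theta, X, y, lam):
    # unregularized cross-entropy part, as in costFunc
    m = y.shape[0]
    h = sigmoid(X.dot(theta))
    J = -np.sum(y * np.log(h) + (1 - y) * np.log(1 - h)) / m
    # L2 penalty on every parameter except the intercept theta_0
    J += (lam / (2 * m)) * np.sum(theta[1:] ** 2)
    return J
# e.g. res = minimize(costFuncReg, initial_theta.flatten(), args=(X_poly, y, 1.0))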

8.2.1 By adding Polynomial Features

components
feature 1 feature 2 faulty
0 0.051267 0.699560 1
1 -0.092742 0.684940 1
2 -0.213710 0.692250 1
3 -0.375000 0.502190 1
4 -0.513250 0.465640 1
... ... ... ...
113 -0.720620 0.538740 0
114 -0.593890 0.494880 0
115 -0.484450 0.999270 0
116 -0.006336 0.999270 0
117 0.632650 -0.030612 0

118 rows × 3 columns

X = components[['feature 1', 'feature 2']]
y = components['faulty']
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
feature_names = poly.get_feature_names_out()
print(feature_names)
['1' 'feature 1' 'feature 2' 'feature 1^2' 'feature 1 feature 2'
 'feature 2^2']
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_poly, y)
LogisticRegression()
label_predictions = model.predict(X_poly)

8.2.1.1 Accuracy Score

accuracy_score(y, label_predictions)
0.8135593220338984

8.2.1.2 Confusion Matrix

cm = confusion_matrix(y, label_predictions)
cm
array([[47, 13],
       [ 9, 49]], dtype=int64)

8.2.2 By Transforming Continuous Variable

Variable transformation is an important technique for building robust logistic regression models. Because the predictors are linear in the log of the odds, it is often helpful to transform continuous variables to create a more linear relationship. To determine the best transformation of a continuous variable, a univariate plot is very helpful. Remember the nice univariate plot of the Y variable against an X variable in linear regression? Such a plot is not as easily obtained here, because Y is dichotomous in logistic regression.

There are several recommended solutions; a sketch of a univariate check follows this list.
  • One is to create several variations of the variable (squared, cubed, or log-transformed, etc.).
  • Another is to break a continuous variable into segments and treat it as a categorical variable. This can work well for picking up nonlinear trends, but the biggest drawback is that it loses the benefit of the linear trend in the curve, and it may also lead to overfitting.
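One simple way to get a univariate view despite the dichotomous target is to bin the continuous predictor and plot the empirical log-odds of the target within each bin: a roughly straight pattern suggests the raw variable is fine, while a curved one suggests a log, polynomial, or binned version may work better. Here is a minimal sketch, assuming a data frame df with a continuous column x and a 0/1 target y (all three names are hypothetical):

# empirical log-odds of a binary target y across bins of a continuous predictor x
binned = pd.cut(df['x'], bins=10)
rate = df.groupby(binned, observed=True)['y'].mean()   # P(y = 1) within each bin
log_odds = np.log(rate / (1 - rate))                   # empirical logit per bin
log_odds.plot(marker='o')                              # look for departures from a straight line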

train = pd.read_csv('./Datasets/Social_Network_Ads_train.csv') #Develop the model on train data
test = pd.read_csv('./Datasets/Social_Network_Ads_test.csv') #Test the model on test data
train.head()
User ID Gender Age EstimatedSalary Purchased
0 15755018 Male 36 33000 0
1 15697020 Female 39 61000 0
2 15796351 Male 36 118000 1
3 15665760 Male 39 122000 1
4 15794661 Female 26 118000 0
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          300 non-null    int64 
 1   Gender           300 non-null    object
 2   Age              300 non-null    int64 
 3   EstimatedSalary  300 non-null    int64 
 4   Purchased        300 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 11.8+ KB
train.Gender.value_counts()
Gender
Female    151
Male      149
Name: count, dtype: int64
tmp_1 = pd.get_dummies(train['Gender'], drop_first=True)
train = pd.concat([train, tmp_1], axis=1)
train.head()
User ID Gender Age EstimatedSalary Purchased Male
0 15755018 Male 36 33000 0 True
1 15697020 Female 39 61000 0 False
2 15796351 Male 36 118000 1 True
3 15665760 Male 39 122000 1 True
4 15794661 Female 26 118000 0 False
tmp_1 = pd.get_dummies(test['Gender'], drop_first=True)
test = pd.concat([test, tmp_1], axis=1)
test.head()
User ID Gender Age EstimatedSalary Purchased Male
0 15810944 Male 35 20000 0 True
1 15668575 Female 26 43000 0 False
2 15603246 Female 27 57000 0 False
3 15694829 Female 32 150000 1 False
4 15697686 Male 29 80000 0 True
# Separating features and target on training set
y_train = train.Purchased
X_train = train.drop(["Purchased", "Gender", "User ID"], axis = 1)
X_train
Age EstimatedSalary Male
0 36 33000 True
1 39 61000 False
2 36 118000 True
3 39 122000 True
4 26 118000 False
... ... ... ...
295 48 96000 False
296 42 149000 True
297 28 79000 True
298 51 134000 False
299 33 28000 False

300 rows × 3 columns

# Separating features and target on test set
y_test = test.Purchased
X_test = test.drop(["Purchased", "Gender", "User ID"], axis = 1)
X_test
Age EstimatedSalary Male
0 35 20000 True
1 26 43000 False
2 27 57000 False
3 32 150000 False
4 29 80000 True
... ... ... ...
95 49 39000 False
96 47 34000 True
97 60 42000 True
98 39 59000 False
99 51 23000 True

100 rows × 3 columns

from sklearn.linear_model import LogisticRegression
sklearn_model = LogisticRegression()
sklearn_model.fit(X_train, y_train)
LogisticRegression()

y_pred_test = sklearn_model.predict(X_test)
print('Accuracy of logistic regression on test set : {:.4f}'.format(accuracy_score(y_test, y_pred_test )))
Accuracy of logistic regression on test set : 0.8800

8.2.2.1 Log transformation of salary


sns.histplot(train.EstimatedSalary)

train["log_salary"] = np.log(train["EstimatedSalary"])
sns.histplot(train.log_salary)

The reason for such a transformation has nothing to do with the variable's distribution. Instead, it has to do with the functional form of the effect. Say we want to know the effect of the number of publications on the probability of getting tenure. It is reasonable to believe that an extra publication has more impact for someone with only 1 publication than for someone who has already published 50 articles. The log transformation is one way to capture such a (testable) assumption of diminishing returns.
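As a quick numeric illustration of that diminishing-returns assumption (a small sketch, not part of the original analysis): on the log scale, the same absolute increase matters less the larger the starting value.

# going from 1 to 2 publications moves log(x) far more than going from 50 to 51
np.log(2) - np.log(1), np.log(51) - np.log(50)
# roughly (0.693, 0.020)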

test["log_salary"] = np.log(test["EstimatedSalary"])
# Separating features and target
y_train = train.Purchased
X_train = train.drop(["Purchased", "Gender", "User ID", "EstimatedSalary"], axis = 1)
X_train
Age Male log_salary
0 36 True 10.404263
1 39 False 11.018629
2 36 True 11.678440
3 39 True 11.711776
4 26 False 11.678440
... ... ... ...
295 48 False 11.472103
296 42 True 11.911702
297 28 True 11.277203
298 51 False 11.805595
299 33 False 10.239960

300 rows × 3 columns

from sklearn.linear_model import LogisticRegression
sklearn_model_log = LogisticRegression()
sklearn_model_log.fit(X_train, y_train)
LogisticRegression()
# Separating features and target for the test dataset
y_test_log = test.Purchased
X_test_log = test.drop(["Purchased", "Gender", "User ID", "EstimatedSalary"], axis = 1)
y_log_pred_test = sklearn_model_log.predict(X_test_log)
print('Accuracy of logistic regression after log transformation of salary on test set : {:.4f}'.format(accuracy_score( y_test_log, y_log_pred_test)))
Accuracy of logistic regression after log transformation of salary on test set : 0.8300

8.2.3 By Binning Continuous Variables

sns.histplot(data=train.Age)

bins = [train.Age.min()-1, 25, 35, 48, train.Age.max()]
labels = ['Big Kid', 'Young Adult', 'Adult', 'Senior']
train['AgeGroup'] = pd.cut(train["Age"], bins, labels = labels)

# draw a bar plot of purchase rate by age group
sns.barplot(x="AgeGroup", y="Purchased", data=train)
plt.show()

train
User ID Gender Age EstimatedSalary Purchased Male log_salary AgeGroup
0 15755018 Male 36 33000 0 True 10.404263 Adult
1 15697020 Female 39 61000 0 False 11.018629 Adult
2 15796351 Male 36 118000 1 True 11.678440 Adult
3 15665760 Male 39 122000 1 True 11.711776 Adult
4 15794661 Female 26 118000 0 False 11.678440 Young Adult
... ... ... ... ... ... ... ... ...
295 15724536 Female 48 96000 1 False 11.472103 Adult
296 15701537 Male 42 149000 1 True 11.911702 Adult
297 15807481 Male 28 79000 0 True 11.277203 Young Adult
298 15603942 Female 51 134000 0 False 11.805595 Senior
299 15690188 Female 33 28000 0 False 10.239960 Young Adult

300 rows × 8 columns

from sklearn import preprocessing 
label_encoder = preprocessing.LabelEncoder() 
train['AgeGroup']= label_encoder.fit_transform(train['AgeGroup']) 
train['AgeGroup'].unique() 
array([0, 3, 1, 2])
test['AgeGroup'] = pd.cut(test["Age"], bins, labels = labels)
# reuse the encoder fitted on the training set rather than refitting it on the test set
test['AgeGroup'] = label_encoder.transform(test['AgeGroup'])
train
User ID Gender Age EstimatedSalary Purchased Male log_salary AgeGroup
0 15755018 Male 36 33000 0 True 10.404263 0
1 15697020 Female 39 61000 0 False 11.018629 0
2 15796351 Male 36 118000 1 True 11.678440 0
3 15665760 Male 39 122000 1 True 11.711776 0
4 15794661 Female 26 118000 0 False 11.678440 3
... ... ... ... ... ... ... ... ...
295 15724536 Female 48 96000 1 False 11.472103 0
296 15701537 Male 42 149000 1 True 11.911702 0
297 15807481 Male 28 79000 0 True 11.277203 3
298 15603942 Female 51 134000 0 False 11.805595 2
299 15690188 Female 33 28000 0 False 10.239960 3

300 rows × 8 columns

# Separating features and target on train set
y_train = train.Purchased
X_train = train.drop(["Purchased", "Gender", "User ID", "EstimatedSalary", "Age"], axis = 1)
X_train
Male log_salary AgeGroup
0 True 10.404263 0
1 False 11.018629 0
2 True 11.678440 0
3 True 11.711776 0
4 False 11.678440 3
... ... ... ...
295 False 11.472103 0
296 True 11.911702 0
297 True 11.277203 3
298 False 11.805595 2
299 False 10.239960 3

300 rows × 3 columns

from sklearn.linear_model import LogisticRegression
sklearn_model_bin = LogisticRegression()
sklearn_model_bin.fit(X_train, y_train)
LogisticRegression()
# Separating features and target on test set
y_test = test.Purchased
X_test_bin = test.drop(["Purchased", "Gender", "User ID", "EstimatedSalary", "Age"], axis = 1)
X_test_bin
Male log_salary AgeGroup
0 True 9.903488 3
1 False 10.668955 3
2 False 10.950807 3
3 False 11.918391 3
4 True 11.289782 3
... ... ... ...
95 False 10.571317 2
96 True 10.434116 0
97 True 10.645425 2
98 False 10.985293 0
99 True 10.043249 2

100 rows × 3 columns

y_bin_pred_test = sklearn_model_bin.predict(X_test_bin)
print('Accuracy of logistic regression after age binning on test set : {:.4f}'.format(accuracy_score( y_test, y_bin_pred_test)))
Accuracy of logistic regression after age binning on test set : 0.7100

8.3 References

  • https://www.linkedin.com/pulse/generating-non-linear-decision-boundaries-using-logistic-d-urso/
  • https://jermwatt.github.io/machine_learning_refined/notes/10_Nonlinear_intro/10_4_Twoclass.html
  • https://www.kaggle.com/code/lzs0047/logistic-regression-non-linear-decision-boundary/edit
  • https://www.kaggle.com/code/ashishrane7/logistic-regression-non-linear-decision-boundary/notebook